#!/bin/sh

# Run the steps to get a permuted dump file.  To generate everything, use
#
#  run-permute all
#
# (running with no arguments does the same).  Otherwise, name one or more of
# the following steps; they are executed in the order given:
#
# permute = Generate permuted article table
# split = Generate split files
# sort = Sort each split file
# combine = Combine results

# Directory holding the TextGrounder Python scripts.  A non-empty value
# already present in the environment wins; otherwise fall back to the
# default location under $HOME.
: "${TEXTGROUNDER_PYTHON:=$HOME/devel/textgrounder/python}"

# Set before sourcing the shared configuration.
# NOTE(review): presumably read by config-wikigrounder to select the
# non-permuted file names -- confirm against that file.
NO_USE_PERMUTED=t
# Quoted so that a directory name containing whitespace still works.
. "$TEXTGROUNDER_PYTHON/config-wikigrounder"

# NOTE(review): DUMP_PREFIX, ORIG_ARTICLE_DATA_SUFFIX, MAXTIME and DEBUG are
# not defined in this script; they are assumed to come from the sourced
# configuration or the environment.

# Prefix of the intermediate split files.
SPLIT_PREFIX="$DUMP_PREFIX-split"

# Script that implements the individual steps (selected with --mode).
PERMUTE_WIKI="$TEXTGROUNDER_PYTHON/permute_wiki.py"

# Output files: the final permuted dump and the permuted article table.
PERMUTED_DUMP_FILE="$DUMP_PREFIX-permuted-pages-articles.xml.bz2"
PERMUTED_OUT_ORIG_ARTICLE_DATA_FILE="$DUMP_PREFIX-permuted-$ORIG_ARTICLE_DATA_SUFFIX"

# Ready-made option strings.  Each holds an option name followed by its
# value, so they must be expanded unquoted where they are used.
PERMUTED_ORIG_ARTICLE_DATA_ARG="--article-data-file $PERMUTED_OUT_ORIG_ARTICLE_DATA_FILE"
SPLIT_PREFIX_ARG="--split-prefix $SPLIT_PREFIX"
NUM_SPLITS_ARG="--number-of-splits 8"

# Extra options passed through to the permute and split invocations.
OTHEROPTS="$MAXTIME $DEBUG"

# Decide which steps to run.  No arguments, or the single argument "all",
# selects every step in pipeline order; anything else is taken as the list
# of step names to run, in the order given.
#
# A case statement is used rather than `[ ... -o ... ]`: the -o operator is
# marked obsolescent by POSIX and can be misparsed when an operand happens
# to look like a test operator.
case "$*" in
  "" | all)
    steps="permute split sort combine"
    ;;
  *)
    steps="$*"
    ;;
esac

echo "Steps are $steps"

# Indices of the split files.  Must agree with the --number-of-splits value
# handed to the split step through $NUM_SPLITS_ARG.
SPLIT_INDICES="0 1 2 3 4 5 6 7"

# Run a single step of the permutation pipeline.
# Globals:   reads PERMUTE_WIKI, ORIG_ARTICLE_DATA_ARG, OTHEROPTS,
#            PERMUTED_OUT_ORIG_ARTICLE_DATA_FILE, IN_DUMP_FILE,
#            PERMUTED_ORIG_ARTICLE_DATA_ARG, SPLIT_PREFIX_ARG,
#            NUM_SPLITS_ARG, SPLIT_PREFIX, PERMUTED_DUMP_FILE, SPLIT_INDICES
# Arguments: $1 - step name: permute, split, sort or combine
# Returns:   0 on success; non-zero when the step's command fails, an
#            input of the combine step is unreadable, or the step name is
#            not recognized.
run_step() {
  case "$1" in
    permute)
      echo "Permuting articles ..."
      # The *_ARG / OTHEROPTS variables hold option lists and are left
      # unquoted on purpose so that they split into separate words.
      "$PERMUTE_WIKI" $ORIG_ARTICLE_DATA_ARG --mode=permute \
        $OTHEROPTS > "$PERMUTED_OUT_ORIG_ARTICLE_DATA_FILE"
      ;;

    split)
      echo "Splitting dump file ..."
      bzcat "$IN_DUMP_FILE" | "$PERMUTE_WIKI" --mode=split \
        $PERMUTED_ORIG_ARTICLE_DATA_ARG $SPLIT_PREFIX_ARG \
        $NUM_SPLITS_ARG $OTHEROPTS
      ;;

    sort)
      echo "Sorting the split files ..."
      for i in $SPLIT_INDICES; do
        SPLITFILE="$SPLIT_PREFIX.$i"
        SPLITARTS="$SPLITFILE.articles"
        echo "Sorting file $SPLITFILE..."
        # Stop at the first split that fails instead of sorting the rest.
        "$PERMUTE_WIKI" -a "$SPLITARTS" --mode=sort \
          < "$SPLITFILE" > "$SPLITFILE.sorted" || return 1
      done
      ;;

    combine)
      splits=""
      echo "Combining the files ..."
      for i in $SPLIT_INDICES; do
        splits="$splits $SPLIT_PREFIX.$i.sorted"
      done
      all_files="$SPLIT_PREFIX.prolog$splits $SPLIT_PREFIX.epilog"
      echo "Concatenating $all_files ..."
      # Check the inputs up front: plain sh has no pipefail, so a failure
      # of cat would otherwise be hidden behind the exit status of bzip2.
      for f in $all_files; do
        if [ ! -r "$f" ]; then
          echo "Cannot read $f" >&2
          return 1
        fi
      done
      cat $all_files | bzip2 > "$PERMUTED_DUMP_FILE"
      ;;

    *)
      # Diagnostics belong on stderr, and an unknown step is an error.
      echo "Unrecognized step $1" >&2
      return 1
      ;;
  esac
}

# Run the requested steps in order.  Abort on the first failure so that a
# later step never works on the partial output of a failed one.
for step in $steps; do
  echo "Executing step '$step' ..."
  if ! run_step "$step"; then
    echo "Step '$step' failed; aborting" >&2
    exit 1
  fi
done
